library(data.table)
library(arm)
library(rstan)
library(countrycode)
library(stargazer)
library(foreign)
library(Amelia)
load("migration_model_data.RData")
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
set.seed(1)

# make null model
vars <- c("log_flow", "iso3_o", "iso3_d", "year", "dyad")
d <- bilateral_migration_data[, vars, with = FALSE]
d <- na.omit(d)
ref_table <- d[, .(dyad, year, log_flow)]
y <- d$log_flow
year <- as.numeric(factor(d$year))
iso3_d <- as.numeric(factor(d$iso3_d))
iso3_o <- as.numeric(factor(d$iso3_o))
d[, int:=1]
d[, c("log_flow", "iso3_o", "iso3_d", "year", "dyad") := NULL]
X <- as.matrix(d)
dl <- list(
  K = ncol(X),
  N = nrow(X),
  y_obs = y,
  X = X,
  N_d = max(iso3_d),
  N_o = max(iso3_o),
  i_d = iso3_d,
  i_o = iso3_o,
  N_year = max(year),
  i_year = year
)
sm <- stan_model("weibull_hurdle_6.stan")

sf_null <- vb(sm, data = dl, output_samples = 10000)
ex_null <- rstan::extract(sf_null, pars = c("resids"))
bias_null <- apply(ex_null$resids, 2, median)
mse <- function(x)
{
  mean(x^2)
}
mse(bias_null) # MSE for comparing with full baseline model


##########################
# APPX D Robustness Checks
##########################
# make model without stocks
vars <- c("log_flow", "distw", "contig", "colony", "comlang_off",
  "pop_d", "log_gdp_dif", "urate_d", "comcur", "lib_dem_o", "lib_dem_d",
  "year_ed_15_plus_o", "iso3_o", "iso3_d", "year", "region_o", 
  "region_d", "dyad", "gdpcap_o", "gdpcap_d", "cw_o")
d <- bilateral_migration_data[, vars, with = FALSE]
d <- na.omit(d)
ref_table <- d[, .(dyad, year, log_flow)]
d$log_dist_w <- log(d$distw)
d$log_pop_d <- log(d$pop_d)
d$log_urate_d <- log(d$urate_d)
d$log_gdpcap_o <- log(d$gdpcap_o)
d$log_gdpcap_d <- log(d$gdpcap_d)
d$log_gdpcap_o2 <- d$log_gdpcap_o^2
d$log_gdpcap_d2 <- d$log_gdpcap_d^2
d$cw_o <- ifelse(is.na(d$cw_o), 0, d$cw_o)
y <- d$log_flow
year <- as.numeric(factor(d$year))
iso3_d <- as.numeric(factor(d$iso3_d))
iso3_o <- as.numeric(factor(d$iso3_o))
d[, c("log_flow", "iso3_o", "iso3_d", "year", "region_o", "region_d",
      "pop_d", "urate_d", "distw", "dyad", "gdpcap_o", "gdpcap_d") := NULL]
X <- as.matrix(d)
for(k in 1:ncol(d)){
  X[, k] <- arm::rescale(X[, k], binary.inputs = "0/1")
}
X <- cbind(1, X)
X <- apply(X, 2, as.numeric)
dl <- list(
  K = ncol(X),
  N = nrow(X),
  y_obs = y,
  X = X,
  N_d = max(iso3_d),
  N_o = max(iso3_o),
  i_d = iso3_d,
  i_o = iso3_o,
  N_year = max(year),
  i_year = year
)
sf_nostocks <- vb(sm, data = dl, output_samples = 10000)
ex_nostocks <- rstan::extract(sf_nostocks, pars = c("resids", "beta"))
bias <- apply(ex_nostocks$resids, 2, median)
ref_table$bias <- bias
ref_table$flow <- ref_table$log_flow
bias_dat <- ref_table
create_bias_measures <- function(make_bias_dat)
{  
  europe <- c("Northern Europe", "Southern Europe", "Western Europe", 
    "Eastern Europe")
  EU95 <- c("Belgium", "France", "Netherlands", "Italy", "Luxembourg", 
    "Germany", "Denmark", "Ireland", "United Kingdom", "Greece", "Portugal",
    "Spain", "Austria", "Finland", "Sweden")
  EU_CEE <- c("Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia",
    "Lithuania", "Malta", "Poland", "Slovakia", "Slovenia", 
    "Bulgaria", "Romania")
  subsaharan <- c("Eastern Africa", "Western Africa", "Southern Africa",
                  "Middle Africa")
  oceania <- c("Micronesia", "Polynesia", "Melanesia", "Australia and New Zealand")
  namerica <- c("Northern America", "Caribbean")
  oecd <- c("Austria", "Belgium", "Canada", 
    "Denmark",  "Finland", "France", "Germany", "Greece", 
    "Iceland", "Ireland", "Italy", "Japan", "Luxembourg",
    "Mexico", "Netherlands", "New Zealand", "Norway", "Portugal",
    "Spain", "Sweden", "Switzerland",
    "United Kingdom", "United States")
  north <- c("Austria", "Australia", "Belgium", "Bulgaria", "Canada", "Cyprus", 
  "Denmark", "Finland", "France", "Germany", "Greece", "Hungary", "Iceland", 
  "Ireland", "Italy", "Liechtenstein", "Malta",
  "Japan", "Luxembourg", "Netherlands", "New Zealand", "Norway",
  "Poland", "Portugal", "Romania", "Russian Federation", "Spain", "Sweden", 
  "Switzerland", "United Kingdom", "United States", "Turkey", "Albania", 
  "Andorra", "Estonia", "Latvia", "Lithuania", "Armenia", 
  "Moldova, Republic of", "Croatia", "Slovenia", 
  "Bosnia and Herzegovina", "Czech Republic", "Slovakia", 
  "Macedonia, the former Yugoslav Republic of", "Serbia", "Montenegro")
  #south <- setdiff(unique(bias_dat$origin_name), north)
  bias_dat <- make_bias_dat
  bias_dat[, origin := substr(bias_dat$dyad, 1, 3)]
  bias_dat[, dest := substr(bias_dat$dyad, 5, 7)]
  bias_dat[, origin_name := countrycode(origin, "iso3c", "country.name")]
  bias_dat[, dest_name := countrycode(dest, "iso3c", "country.name")]
  bias_dat[, region_o := countrycode(origin, "iso3c", "region")]
  bias_dat[, region_d := countrycode(dest, "iso3c", "region")]
  bias_dat$region_map <- bias_dat$region_d
  bias_dat$north_south <- bias_dat$region_o
  bias_dat$original_origins <- bias_dat$region_o
  bias_dat$north_south_o[bias_dat$origin_name %in% north] <- "Global North"
  bias_dat$north_south_o[bias_dat$origin_name %in% 
    setdiff(unique(bias_dat$origin_name), north)] <- "Global South"
  bias_dat$north_south_d[bias_dat$dest_name %in% north] <- "Global North"
  bias_dat$north_south_d[bias_dat$dest_name %in% 
    setdiff(unique(bias_dat$dest_name), north)] <- "Global South"
  bias_dat$region_o[bias_dat$origin_name %in% oecd] <- "OECD"
  bias_dat$region_o[bias_dat$region_o %in% subsaharan] <- "Sub-Saharan Africa"
  bias_dat$region_o[bias_dat$region_o %in% oceania] <- "Oceania"
  bias_dat$region_o[bias_dat$region_o %in% namerica] <- "North America"
  bias_dat$region_o[bias_dat$region_o %in% europe] <- "Europe"
  bias_dat$region_d[bias_dat$region_d %in% europe] <- "Europe"
  bias_dat$region_d[bias_dat$region_d %in% subsaharan] <- "Sub-Saharan Africa"
  bias_dat$region_d[bias_dat$region_d %in% oceania] <- "Oceania"
  bias_dat$region_d[bias_dat$region_d %in% namerica] <- "North America"
  bias_dat$region_d[bias_dat$dest_name %in% oecd] <- "OECD"
  bias_dat$region_map[bias_dat$region_map %in% europe] <- "Europe"
  bias_dat$region_map[bias_dat$region_map %in% subsaharan] <- "Sub-Saharan Africa"
  bias_dat$region_map[bias_dat$region_map %in% oceania] <- "Oceania"
  bias_dat$region_map[bias_dat$region_map %in% namerica] <- "North America"
  bias_dat[, region_bias := mean(bias), by = .(dest_name, region_o, year)]
  bias_dat[, dwloss := ifelse(bias < 0, -exp(abs(bias)), exp(bias))]
  bias_dat[, plot_years := paste(as.numeric(year) + 1, "-", 
                                 as.numeric(year) + 5, sep = "")]
  bias_dat[, plot_regions := paste0("From: ", region_o)]
}

bias_dat <- create_bias_measures(bias_dat)

# replicate below here
load("attribution_data.RData")
eth_dat <- read.dta("AER20040333_5y.dta")

setDT(eth_dat)
eth_dat <- eth_dat[period == 8, ETHLRGPRIMEXP / PRIMEXP, by = countryname]
eth_dat[, V1 := ifelse(is.na(V1), 1, V1)]
setnames(eth_dat, c("origin", "largest_eth_pct"))
eth_dat[, iso3c := countrycode::countrycode(origin, "country.name", "iso3c")]
eth_dat[, origin := NULL]
attr_data <- merge(attr_data, eth_dat, by = "iso3c")

gen_dist <- foreign::read.dta("genetic_distance.dta")
setDT(gen_dist)  
gen_dist$origin <- 
  countrycode::countrycode(gen_dist$country_1, "country.name", "iso3c")
gen_dist$dest <- 
  countrycode::countrycode(gen_dist$country_2, "country.name", "iso3c")
gen_dist <- gen_dist[, .(fst_distance_weighted, origin, dest)]  
setDT(gen_dist)
bias_dat <- merge(bias_dat, gen_dist, by = c("origin", "dest"))



gen_dist <- bias_dat[flow > 0, 
  mean(fst_distance_weighted, na.rm = TRUE), by = "origin"]
setnames(gen_dist, c("origin", "avg_gen_dist"))
attr_data <- merge(attr_data, gen_dist, by.x = "iso3c", by.y = c("origin"))


attr_data$z_hc <- rescale(attr_data$hc)
attr_data$z_bias <- rescale(attr_data$bias)
attr_data$z_approx <- rescale(attr_data$approx)
attr_data$z_lib_democracy <- rescale(attr_data$lib_democracy)
attr_data$z_gini <- rescale(attr_data$gini)
attr_data$z_civil_lib <- rescale(attr_data$civil_liberties)
attr_data$z_log_pop <- rescale(attr_data$log_pop)
attr_data$z_log_gdpc <- rescale(attr_data$log_gdp_capita)
attr_data$z_growth_rate <- rescale(attr_data$gdp_growth_rate)
attr_data$z_rconna <- rescale(attr_data$rconna * 1e6)
attr_data$z_ck <- rescale(attr_data$log_cap_stock)
attr_data$z_ed <- rescale(attr_data$education_15_plus)
attr_data$z_remit <- rescale(attr_data$log_remittances)
attr_data[, hc_lag := shift(hc, n = 5), by = origin]
attr_data[, five_years := ifelse(year %in% seq(1990, 1995, 1), 1,
  ifelse(year %in% seq(1996, 2000, 1), 2,
    ifelse(year %in% seq(2001, 2005, 1), 3, 4)))] 
attr_data[, five_years_num := ifelse(year %in% seq(1990, 1995, 1), 1990,
  ifelse(year %in% seq(1996, 2000, 1), 1995,
    ifelse(year %in% seq(2001, 2005, 1), 2000, 2005)))] 
attr_data[, year_flow := log(sum(log_flow)), by = .(origin, year)]
attr_data[, pc_year_flow := log(sum(log_flow)) / pop, by = .(origin, year)]

skin <- read.csv("skincolor.csv")
bias_dat <- merge(bias_dat, skin, by.x = "origin_name", by.y = "country")
attr_data <- merge(attr_data, skin, by.x = "origin", by.y = "country")
bias_dat[, north := ifelse(medical %in% c("Chinese", "Japanese", "White"), 1, 0)]
bias_dat[, black := ifelse(!(medical %in% c("Black")), 0, 1)]
attr_data[, black := ifelse(!(medical %in% c("Black")), 0, 1)]
attr_data[, north := ifelse(medical %in% c("Chinese", "Japanese", "White"), 1, 0)]
attr_data[, white := ifelse(medical %in% c("Chinese", "Japanese", "White"), 1, 0)]

oecd <- dput(as.character(unique(attr_data[oecd == 1, origin])))
bias_dat[, oecd := ifelse(origin_name %in% oecd, 1, 0)]
bias_dat[, flow := exp(flow)]

dist <- 
  bilateral_migration_data[, mean(distw, na.rm = TRUE), 
  by = origin_name]
setnames(dist, c("origin", "mean_log_dist"))
attr_data <- merge(attr_data, dist, by = "origin")

attr_data[, emp_rate := emp / pop, by = .(origin, year)]
mod_dat <- attr_data[, c("black", "log_gdp_capita", "hc", "emp_rate",
  "mean_log_dist", "lib_democracy", "gini", 
  "civil_liberties", "inflation_rate", "north", "oecd", "gdp_growth_rate",
  "education_15_plus", "approx", "white", "civil_war", "region",
  "year_flow", "pc_year_flow", "pop", "avg_gen_dist", "largest_eth_pct"), 
  with = FALSE]
mod_dat <- as.data.frame(apply(mod_dat, 2, rescale, binary.inputs = "0/1"))
mod_dat$bias <- attr_data$bias
mod_dat$year <- attr_data$year
mod_dat$iso3c <- attr_data$iso3c
mod_dat$region <- attr_data$new_region
mod_dat$race <- attr_data$medical

mod_dat$black_regions <- as.character(attr_data$medical)
mod_dat$test_regions <- as.character(mod_dat$black_regions)
mod_dat$test_regions <- ifelse(mod_dat$test_regions == "Chinese" | 
  mod_dat$test_regions == "Japanese", "East Asian", mod_dat$test_regions)
mod_dat$test_regions <- ifelse(mod_dat$test_regions == "Philipino",
  "Southeast Asian", mod_dat$test_regions)
mod_dat$test_regions <- ifelse(mod_dat$test_regions == "White" &
  mod_dat$oecd == 1, "OECD, White", ifelse(mod_dat$test_regions == "White" &
  mod_dat$oecd == 0, "Non-OECD, White", mod_dat$test_regions))
mod_dat$black_oecd <- ifelse(mod_dat$test_regions == "Black",
  "Black", ifelse(mod_dat$test_regions == "OECD, White", "OECD",
  "Other"))
mod_dat$test_regions <- as.factor(mod_dat$test_regions)
mod_dat$test_regions <- relevel(mod_dat$test_regions, ref = "Black")

mod1_simp <- lm(bias ~ black + 
  iso3c, 
  data = mod_dat)

mod1_regions <- lm(bias ~ test_regions + 
  iso3c, 
  data = mod_dat)
stargazer::stargazer(mod1_simp, mod1_regions,
  omit = c("iso3c", "factor"), style = "apsr",
  covariate.labels = c("Black", "Arab", "East Asian",
  "Latin American", "Non-OECD, White", "OECD, White",
  "South Asian", "Southeast Asian",
  "West Asian", "(Intercept)"), ci = TRUE,
  digits = 2, align = TRUE,
  add.lines = list(c("Country FE", "\\checkmark", "\\checkmark")),
  dep.var.labels = "Deviation",
  ci.level = 0.95,
  star.cutoffs = c(0.05, 0.01, 0.001))

# Now genetic distance model


gen_dist <- read.dta("genetic_distance.dta")
setDT(gen_dist)  
gen_dist$origin <- 
  countrycode::countrycode(gen_dist$country_1, "country.name", "iso3c")
gen_dist$dest <- 
  countrycode::countrycode(gen_dist$country_2, "country.name", "iso3c")
gen_dist <- gen_dist[, .(fst_distance_weighted, origin, dest)]  
setDT(gen_dist)
test_dat <- merge(bias_dat, gen_dist, by = c("origin", "dest"))

test_dat$gen_dist <- arm::rescale(test_dat$fst_distance_weighted.x)
mod1 <- lm(bias ~ gen_dist,
  data = test_dat)
mod2 <- lm(bias ~ gen_dist +
  origin +
  dest,
  data = test_dat)
# make origin-period, dest-period factor
test_dat$origin_pd <- paste0(test_dat$origin, test_dat$plot_years)
test_dat$dest_pd <- paste0(test_dat$dest, test_dat$plot_years)
mod3 <- lm(bias ~ gen_dist +
  origin +
  dest +
  origin_pd +
  dest_pd,
  data = test_dat)
summary(mod3)
stargazer::stargazer(mod1, mod2, mod3, style = "apsr",
  covariate.labels = c("Allele Distance"), ci = TRUE,
  digits = 2, omit = c("dest", "origin", "year"), align = TRUE,
  add.lines = list(c("Origin FE", "\\textnormal{X}", "\\checkmark", "\\checkmark"),
  c("Destination FE", "\\textnormal{X}", "\\checkmark", "\\checkmark"),
  c("Origin-Period FE", "\\textnormal{X}", "\\textnormal{X}", "\\checkmark"),
  c("Destination-Period FE", "\\textnormal{X}", "\\textnormal{X}", "\\checkmark")),
  dep.var.labels = "Deviation",
  ci.level = 0.95,
  star.cutoffs = c(0.05, 0.01, 0.001))

##############
# APPX E BELOW
##############

load("attribution_data.RData")
load("mig_analysis_data.RData")
eth_dat <- read.dta("AER20040333_5y.dta")
setDT(eth_dat)
eth_dat <- eth_dat[period == 8, ETHLRGPRIMEXP / PRIMEXP, by = countryname]
eth_dat[, V1 := ifelse(is.na(V1), 1, V1)]
setnames(eth_dat, c("origin", "largest_eth_pct"))
eth_dat[, iso3c := countrycode::countrycode(origin, "country.name", "iso3c")]
eth_dat[, origin := NULL]
attr_data <- merge(attr_data, eth_dat, by = "iso3c")
gen_dist <- read.dta("genetic_distance.dta")
setDT(gen_dist)
gen_dist$origin <-
  countrycode::countrycode(gen_dist$country_1, "country.name", "iso3c")
gen_dist$dest <-
  countrycode::countrycode(gen_dist$country_2, "country.name", "iso3c")
gen_dist <- gen_dist[, .(fst_distance_weighted, origin, dest)]
setDT(gen_dist)
bias_dat <- merge(bias_dat, gen_dist, by = c("origin", "dest"))
gen_dist <- bias_dat[flow > 0,
  mean(fst_distance_weighted, na.rm = TRUE), by = "origin"]
setnames(gen_dist, c("origin", "avg_gen_dist"))
attr_data <- merge(attr_data, gen_dist, by.x = "iso3c", by.y = c("origin"))
attr_data$z_hc <- rescale(attr_data$hc)
attr_data$z_bias <- rescale(attr_data$bias)
attr_data$z_approx <- rescale(attr_data$approx)
attr_data$z_lib_democracy <- rescale(attr_data$lib_democracy)
attr_data$z_gini <- rescale(attr_data$gini)
attr_data$z_civil_lib <- rescale(attr_data$civil_liberties)
attr_data$z_log_pop <- rescale(attr_data$log_pop)
attr_data$z_log_gdpc <- rescale(attr_data$log_gdp_capita)
attr_data$z_growth_rate <- rescale(attr_data$gdp_growth_rate)
attr_data$z_rconna <- rescale(attr_data$rconna * 1e6)
attr_data$z_ck <- rescale(attr_data$log_cap_stock)
attr_data$z_ed <- rescale(attr_data$education_15_plus)
attr_data$z_remit <- rescale(attr_data$log_remittances)
attr_data[, hc_lag := shift(hc, n = 5), by = origin]
attr_data[, five_years := ifelse(year %in% seq(1990, 1995, 1), 1,
  ifelse(year %in% seq(1996, 2000, 1), 2,
    ifelse(year %in% seq(2001, 2005, 1), 3, 4)))]
attr_data[, five_years_num := ifelse(year %in% seq(1990, 1995, 1), 1990,
  ifelse(year %in% seq(1996, 2000, 1), 1995,
    ifelse(year %in% seq(2001, 2005, 1), 2000, 2005)))]
attr_data[, year_flow := log(sum(log_flow)), by = .(origin, year)]
attr_data[, pc_year_flow := log(sum(log_flow)) / pop, by = .(origin, year)]

load("migration_model_data.RData")
skin <- read.csv("skincolor.csv")
attr_data <- merge(attr_data, skin, by.x = "origin", by.y = "country")
attr_data[, black := ifelse(!(medical %in% c("Black")), 0, 1)]
attr_data[, north := ifelse(medical %in% c("Chinese", "Japanese", "White"), 1, 0)]
attr_data[, white := ifelse(medical %in% c("Chinese", "Japanese", "White"), 1, 0)]


dist <-
  bilateral_migration_data[, mean(distw, na.rm = TRUE),
  by = origin_name]
setnames(dist, c("origin", "mean_log_dist"))
attr_data <- merge(attr_data, dist, by = "origin")

attr_data[, emp_rate := emp / pop, by = .(origin, year)]
mod_dat <- attr_data[, c("black", "log_gdp_capita", "hc", "emp_rate",
  "mean_log_dist", "lib_democracy", "gini",
  "civil_liberties", "inflation_rate", "north", "oecd", "gdp_growth_rate",
  "education_15_plus", "approx", "white", "civil_war", "region",
  "year_flow", "pc_year_flow", "pop", "avg_gen_dist", "largest_eth_pct"),
  with = FALSE]
mod_dat <- as.data.frame(apply(mod_dat, 2, rescale, binary.inputs = "0/1"))
mod_dat$bias <- attr_data$bias
mod_dat$year <- attr_data$year
mod_dat$iso3c <- attr_data$iso3c
mod_dat$region <- attr_data$new_region
mod_dat$race <- attr_data$medical

mod_dat$black_regions <- as.character(attr_data$medical)
mod_dat$test_regions <- as.character(mod_dat$black_regions)
mod_dat$test_regions <- ifelse(mod_dat$test_regions == "Chinese" |
  mod_dat$test_regions == "Japanese", "East Asian", mod_dat$test_regions)
mod_dat$test_regions <- ifelse(mod_dat$test_regions == "Philipino",
  "Southeast Asian", mod_dat$test_regions)
mod_dat$test_regions <- ifelse(mod_dat$test_regions == "White" &
  mod_dat$oecd == 1, "OECD, White", ifelse(mod_dat$test_regions == "White" &
  mod_dat$oecd == 0, "Non-OECD, White", mod_dat$test_regions))
mod_dat$black_oecd <- ifelse(mod_dat$test_regions == "Black",
  "Black", ifelse(mod_dat$test_regions == "OECD, White", "OECD",
  "Other"))
mod_dat$test_regions <- as.factor(mod_dat$test_regions)
mod_dat$test_regions <- relevel(mod_dat$test_regions, ref = "Black")
setDT(mod_dat)
mod_dat[, `:=`(
  emp_rate_lag = shift(emp_rate, 5),
  year_flow_lag = shift(year_flow, 5),
  avg_gen_dist_lag = shift(avg_gen_dist, 5),
  largest_eth_pct_lag = shift(largest_eth_pct, 5)
)]

mod1_simp <- lm(bias ~ black +
  factor(year) +
  iso3c,
  data = mod_dat)
setDT(mod_dat)

mod1 <- lm(bias ~ black +
  emp_rate_lag +
  year_flow_lag +
  avg_gen_dist_lag +
  largest_eth_pct_lag +
  factor(year) +
  iso3c,
  data = mod_dat)


mod1_regions <- lm(bias ~ test_regions +
  emp_rate +
  year_flow +
  avg_gen_dist +
  largest_eth_pct +
  factor(year) +
  iso3c,
  data = mod_dat)

stargazer::stargazer(mod1_simp, mod1, mod1_regions,
  omit = c("iso3c", "factor"), style = "apsr",
  covariate.labels = c("Black", "Arab", "East Asian",
  "Latin American", "Non-OECD, White", "OECD, White",
  "South Asian", "Southeast Asian",
  "West Asian",
  "Employment Rate", "Year Flow (log)", "Avg. Allele Dist.",
  "Diversity", "(Intercept)"), ci = TRUE,
  digits = 2, align = TRUE,
  add.lines = list(
    c("Country FE", "\\checkmark", "\\checkmark", "\\checkmark")),
  dep.var.labels = "Deviation",
  ci.level = 0.95,
  star.cutoffs = c(0.05, 0.01, 0.001))